/*******************************************************************************
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package de.tudarmstadt.ukp.lmf.api;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.hibernate.Criteria;
import org.hibernate.ScrollableResults;
import org.hibernate.criterion.MatchMode;
import org.hibernate.criterion.Projections;
import org.hibernate.criterion.Property;
import org.hibernate.criterion.Restrictions;
import org.hibernate.type.StandardBasicTypes;

import de.tudarmstadt.ukp.lmf.model.core.LexicalEntry;
import de.tudarmstadt.ukp.lmf.model.core.LexicalResource;
import de.tudarmstadt.ukp.lmf.model.core.Lexicon;
import de.tudarmstadt.ukp.lmf.model.core.Sense;
import de.tudarmstadt.ukp.lmf.model.enums.ELanguageIdentifier;
import de.tudarmstadt.ukp.lmf.model.enums.EPartOfSpeech;
import de.tudarmstadt.ukp.lmf.model.enums.ESenseAxisType;
import de.tudarmstadt.ukp.lmf.model.morphology.FormRepresentation;
import de.tudarmstadt.ukp.lmf.model.morphology.Lemma;
import de.tudarmstadt.ukp.lmf.model.multilingual.SenseAxis;
import de.tudarmstadt.ukp.lmf.model.semantics.SenseRelation;
import de.tudarmstadt.ukp.lmf.transform.DBConfig;

/**
 * This class represents an extension of the {@link Uby} class to support
 * collecting resource statistics.
 * <p>
 * All queries are issued through the Hibernate session inherited from
 * {@link Uby}.
 *
 * @author Silvana Hartmann
 */
public class UbyStatistics extends Uby {

    /**
     * Creates a {@link UbyStatistics} instance based on the consumed parameter.
     *
     * @param dbConfig
     *            Configuration holder of the UBY database used for generating
     *            the statistics
     *
     * @throws IllegalArgumentException
     *             if the provided dbConfig is null
     *
     * @see DBConfig
     */
    public UbyStatistics(DBConfig dbConfig) throws IllegalArgumentException {
        super(dbConfig);
    }

    /**
     * Counts the number of {@link Sense} instances in the {@link Lexicon}
     * specified by the given name.
     *
     * @param lexiconName
     *            name of the lexicon which senses should be counted
     *
     * @return the number of senses in the lexicon or zero, if the lexicon with
     *         the specified name does not exist
     */
    public long countSensesPerLexicon(String lexiconName) {
        Criteria criteria = session.createCriteria(Sense.class)
                .createCriteria("lexicalEntry")
                .createCriteria("lexicon")
                .add(Restrictions.eq("name", lexiconName));
        return statisticsRowCount(criteria);
    }

    /**
     * Counts the number of {@link LexicalEntry} instances in the
     * {@link Lexicon} specified by the given name.
     *
     * @param lexiconName
     *            name of the lexicon which lexical entries should be counted
     *
     * @return the number of lexical entries in the lexicon or zero, if the
     *         lexicon with the specified name does not exist
     */
    public long countLexicalEntriesPerLexicon(String lexiconName) {
        Criteria criteria = session.createCriteria(LexicalEntry.class)
                .createCriteria("lexicon")
                .add(Restrictions.eq("name", lexiconName));
        return statisticsRowCount(criteria);
    }

    /**
     * Counts the number of distinct lemma+pos combinations per lexicon.
     *
     * @param lexiconName
     *            Name of the lexicon
     *
     * @return the number of distinct lemma+pos combinations in the lexicon,
     *         i.e. the size of the set returned by
     *         {@link #getLemmaPosPerLexicon(String)}
     */
    public long countLemmaPosPerLexicon(String lexiconName) {
        return getLemmaPosPerLexicon(lexiconName).size();
    }

    /**
     * Counts the number of distinct lemma+pos combinations per lexicon,
     * part-of-speech prefix and language.
     *
     * @param lexiconName
     *            Name of the lexicon
     * @param prefix
     *            The partOfSpeech prefix
     * @param lang
     *            The language identifier of the lexicon
     *
     * @return the number of distinct lemma+pos combinations, i.e. the size of
     *         the set returned by
     *         {@link #getLemmaPosPerLexiconAndPosPrefixAndLanguage(String, String, String)}
     */
    public long countLemmaPosPerLexiconAndPosPrefixAndLanguage(String lexiconName,
            String prefix, String lang) {
        return getLemmaPosPerLexiconAndPosPrefixAndLanguage(lexiconName, prefix, lang).size();
    }

    /**
     * Returns a {@link Set} of {@link String} instances consisting of
     * <code>lemma+"_"+part-of-speech</code>, filtered by given {@link Lexicon}
     * name.<br>
     * The lemma part is the written form of a {@link FormRepresentation} of the
     * entry's {@link Lemma}; the query joins all form representations, so each
     * of them contributes one string. Entries without a part-of-speech yield
     * <code>lemma+"_null"</code>.
     *
     * @param lexiconName
     *            name of the lexicon which lemmas should be used; if null, the
     *            result is not restricted to a single lexicon
     *
     * @return a set of strings containing lemma and part-of-speech of the
     *         specified lexicon.<br>
     *         This method returns an empty set if the lexicon with the
     *         specified name does not exist.
     *
     * @see Lemma#getFormRepresentations()
     * @see FormRepresentation#getWrittenForm()
     * @see EPartOfSpeech
     */
    public Set<String> getLemmaPosPerLexicon(String lexiconName) {
        return collectLemmaPosStrings(lexiconName, null, null);
    }

    /**
     * Returns a {@link Set} of {@link String} instances consisting of
     * <code>lemma+"_"+part-of-speech</code>, filtered by given {@link Lexicon}
     * name, part-of-speech prefix and a language identifier.<br>
     * The lemma part is the written form of a {@link FormRepresentation} of the
     * entry's {@link Lemma}; the query joins all form representations, so each
     * of them contributes one string. Entries without a part-of-speech yield
     * <code>lemma+"_null"</code>.
     *
     * @param lexiconName
     *            name of the lexicon which lemmas should be used; ignored if
     *            null
     *
     * @param prefix
     *            the part-of-speech prefix used when filtering
     *            {@link LexicalEntry} instances; ignored if null. The value is
     *            passed unchanged as the pattern of an SQL <code>LIKE</code>
     *            comparison, so any wildcard has to be part of the argument.
     *
     * @param lang
     *            the language identifier used when filtering lexical entries;
     *            ignored if null
     *
     * @return a set of strings containing lemma and part-of-speech of the
     *         specified lexicon.<br>
     *         This method returns an empty set if the lexicon with the
     *         specified name does not exist or the lexicon does not contain any
     *         lexical entries with specified part-of-speech prefix and language
     *         identifier.
     *
     * @see Lemma#getFormRepresentations()
     * @see FormRepresentation#getWrittenForm()
     * @see EPartOfSpeech
     * @see ELanguageIdentifier
     */
    public Set<String> getLemmaPosPerLexiconAndPosPrefixAndLanguage(String lexiconName,
            String prefix, String lang) {
        return collectLemmaPosStrings(lexiconName, prefix, lang);
    }

    /**
     * Counts the number of {@link SenseRelation} instances within the
     * {@link Lexicon} specified by the given name. A relation is attributed to
     * the lexicon of its source sense.
     *
     * @param lexiconName
     *            name of the lexicon which sense relations should be counted
     *
     * @return the number of sense relations in the lexicon or zero if the
     *         lexicon with the specified name does not exist
     */
    public long countSenseRelationsPerLexicon(String lexiconName) {
        Criteria criteria = session.createCriteria(SenseRelation.class)
                .createCriteria("source")
                .createCriteria("lexicalEntry")
                .createCriteria("lexicon")
                .add(Restrictions.eq("name", lexiconName));
        return statisticsRowCount(criteria);
    }

    /**
     * Counts the number of {@link Sense} instances within the UBY-LMF
     * {@link LexicalResource} contained in the database accessed by this
     * {@link UbyStatistics} instance.
     *
     * @return the number of senses in the lexical resource or zero if the
     *         lexical resource does not contain any senses
     */
    public long countSenses() {
        return countClassEntities(Sense.class);
    }

    /**
     * Counts the number of {@link LexicalEntry} instances within the UBY-LMF
     * {@link LexicalResource} contained in the database accessed by this
     * {@link UbyStatistics} instance.
     *
     * @return the number of lexical entries in the lexical resource or zero if
     *         the lexical resource does not contain any lexical entries
     */
    public long countLexicalEntries() {
        return countClassEntities(LexicalEntry.class);
    }

    /**
     * Counts the number of {@link SenseAxis} instances within the UBY-LMF
     * {@link LexicalResource} contained in the database accessed by this
     * {@link UbyStatistics} instance.
     *
     * @return the number of sense axes in the lexical resource or zero if the
     *         lexical resource does not contain any sense axes
     */
    public long countSenseAxes() {
        return countClassEntities(SenseAxis.class);
    }

    /**
     * Counts the number of instances of a specified UBY-LMF class. The
     * instances are counted within the UBY-LMF {@link LexicalResource}
     * contained in the database accessed by this {@link UbyStatistics}
     * instance.
     *
     * @param ubyClass
     *            specifies the UBY-LMF class which instances should be counted
     *
     * @return the number of specified UBY-LMF class instances contained in the
     *         lexical resource or zero if the lexical resource does not contain
     *         any instances of the specified class
     */
    public long countClassEntities(@SuppressWarnings("rawtypes") Class ubyClass) {
        return statisticsRowCount(session.createCriteria(ubyClass));
    }

    /**
     * Counts the number of {@link SenseAxis} instances between two
     * {@link Lexicon} instances identified by their name. The counted sense
     * axes are filtered by the specified type.
     * <p>
     * <b>Important properties of this method:</b>
     * <ul>
     * <li>Only alignments between {@link Sense} instances are considered.</li>
     * <li>The sources of the alignments are not distinguished.</li>
     * <li>The lexicons are identified by identifier prefixes of the aligned
     * senses.</li>
     * </ul>
     *
     * @param type
     *            Type of sense axes to be considered when counting
     *
     * @param lex1Name
     *            The name of the first of two lexicons between which sense axes
     *            should be counted
     *
     * @param lex2Name
     *            The name of the second of two lexicons between which sense
     *            axes should be counted
     *
     * @return the number of sense axes between the lexicons filtered by the
     *         specified sense axes type. This method returns zero if a lexicon
     *         with the specified name does not exist or one of the consumed
     *         arguments is null.
     *
     * @see ESenseAxisType
     */
    public long countSenseAxesPerLexiconPair(ESenseAxisType type, String lex1Name,
            String lex2Name) {
        Criteria criteria = senseAxisCriteriaForLexiconPair(type, lex1Name, lex2Name);
        if (criteria == null) {
            return 0L;
        }
        return statisticsRowCount(criteria);
    }

    /**
     * Returns a {@link List} containing all {@link SenseAxis} instances between
     * two {@link Lexicon} instances identified by their name. The returned
     * sense axes are filtered by the specified type.
     * <p>
     * <b>Important properties of this method:</b>
     * <ul>
     * <li>Only alignments between {@link Sense} instances are considered.</li>
     * <li>The sources of the alignments are not distinguished.</li>
     * <li>The lexicons are identified by identifier prefixes of the aligned
     * senses.</li>
     * </ul>
     *
     * @param type
     *            Type of sense axes to be returned
     *
     * @param lex1Name
     *            The name of the first of two lexicons between which sense axes
     *            should be found
     * @param lex2Name
     *            The name of the second of two lexicons between which sense
     *            axes should be found
     *
     * @return the list of sense axes between the lexicons filtered by the
     *         specified sense axes type. This method returns an empty list if a
     *         lexicon with the specified name does not exist or one of the
     *         consumed arguments is null.
     *
     * @see ESenseAxisType
     */
    public List<SenseAxis> getSenseAxesPerLexiconPair(ESenseAxisType type, String lex1Name,
            String lex2Name) {
        Criteria criteria = senseAxisCriteriaForLexiconPair(type, lex1Name, lex2Name);
        if (criteria == null) {
            return new ArrayList<SenseAxis>();
        }
        // The criteria is rooted at SenseAxis and carries no projection, so
        // the untyped Hibernate result list only holds SenseAxis instances.
        @SuppressWarnings("unchecked")
        List<SenseAxis> axes = criteria.list();
        return axes;
    }

    /**
     * Applies a row-count projection to the given criteria and returns the
     * result as a primitive long.
     */
    private static long statisticsRowCount(Criteria criteria) {
        Object result = criteria.setProjection(Projections.rowCount()).uniqueResult();
        // Go through Number so the result does not depend on the exact boxed
        // type handed back by the projection.
        return result == null ? 0L : ((Number) result).longValue();
    }

    /**
     * Runs the lemma/part-of-speech query with the given optional filters and
     * collects the distinct <code>lemma+"_"+part-of-speech</code> strings.
     */
    private Set<String> collectLemmaPosStrings(String lexiconName, String prefix, String lang) {
        Criteria criteria = session.createCriteria(Lexicon.class, "l");
        criteria = criteria.createCriteria("lexicalEntries", "e");
        if (lexiconName != null) {
            criteria = criteria.add(Restrictions.eq("l.name", lexiconName));
        }
        if (lang != null) {
            criteria = criteria.add(Restrictions.eq("l.languageIdentifier", lang));
        }
        if (prefix != null) {
            // Bind the pattern as a parameter instead of concatenating it into
            // the SQL text, so quotes in the argument cannot alter the query.
            criteria = criteria.add(Restrictions.sqlRestriction("partOfSpeech like ?",
                    prefix, StandardBasicTypes.STRING));
        }
        criteria = criteria.createCriteria("lemma")
                .createCriteria("formRepresentations", "f")
                .setProjection(Projections.projectionList()
                        .add(Property.forName("f.writtenForm"))
                        .add(Property.forName("e.partOfSpeech")));

        Set<String> lemmaPos = new HashSet<String>();
        ScrollableResults rows = criteria.scroll();
        try {
            while (rows.next()) {
                Object[] row = rows.get();
                // some resources do not have POS
                String pos = (row[1] == null) ? "null" : ((EPartOfSpeech) row[1]).toString();
                lemmaPos.add((String) row[0] + "_" + pos);
            }
        }
        finally {
            // Release the underlying cursor even if reading a row fails.
            rows.close();
        }
        return lemmaPos;
    }

    /**
     * Builds the {@link SenseAxis} criteria restricted to the given type and to
     * the sense identifier prefixes of the two lexicons, or returns null if for
     * at least one of the lexicons no sense could be found.
     */
    private Criteria senseAxisCriteriaForLexiconPair(ESenseAxisType type, String lex1Name,
            String lex2Name) {
        String senseId1 = findAnySenseIdOfLexicon(lex1Name);
        String senseId2 = findAnySenseIdOfLexicon(lex2Name);
        if (senseId1 == null || senseId2 == null) {
            return null;
        }
        // get alignments with these prefixes
        return session.createCriteria(SenseAxis.class)
                .add(Restrictions.eq("senseAxisType", type))
                .add(Restrictions.like("senseOne.id", lexiconPrefixOfSenseId(senseId1),
                        MatchMode.START))
                .add(Restrictions.like("senseTwo.id", lexiconPrefixOfSenseId(senseId2),
                        MatchMode.START));
    }

    /**
     * Returns the identifier of one arbitrary {@link Sense} of the lexicon with
     * the given name, or null if the query yields no sense.
     */
    private String findAnySenseIdOfLexicon(String lexiconName) {
        Criteria criteria = session.createCriteria(Sense.class, "s")
                .createCriteria("lexicalEntry")
                .createCriteria("lexicon")
                .add(Restrictions.eq("name", lexiconName))
                .setProjection(Projections.property("s.id"))
                .setMaxResults(1);
        return (String) criteria.uniqueResult();
    }

    /**
     * Derives the lexicon prefix from a sense identifier: the part before the
     * first underscore, extended by the second part if that part is "en" or
     * "de". Identifiers without an underscore are returned unchanged.
     */
    private static String lexiconPrefixOfSenseId(String senseId) {
        // A positive limit guarantees at least one element and keeps the first
        // two parts identical to an unlimited split.
        String[] parts = senseId.split("_", 3);
        String prefix = parts[0];
        if (parts.length > 1 && ("en".equals(parts[1]) || "de".equals(parts[1]))) {
            prefix += "_" + parts[1];
        }
        return prefix;
    }
}